notebook.community

Edit and run



In [ ]:



In [ ]:

    
!wget https://raw.githubusercontent.com/daniel-acuna/hackathon_syracuse/master/data/RoadRatings2015.csv



In [ ]:

    
!wget https://raw.githubusercontent.com/daniel-acuna/hackathon_syracuse/master/data/potholes.csv



In [ ]:

    
# Print the shell's current working directory.
!pwd



In [ ]:

    
# Read the road ratings CSV into a Spark DataFrame with the spark-csv reader:
# the first line is the header, column types are inferred, and the literal
# string 'NA' is read as null.
# The file is located relative to the notebook's working directory (where the
# wget cell saves it) instead of a machine-specific absolute path.
import os

roadratings_df = sqlContext.read\
    .format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .option("nullValue", 'NA')\
    .load("file://" + os.path.abspath("RoadRatings2015.csv"))



In [ ]:

    
# Read the potholes CSV into a Spark DataFrame with the spark-csv reader:
# the first line is the header, column types are inferred, and the literal
# string 'NA' is read as null.
# The file is located relative to the notebook's working directory (where the
# wget cell saves it) instead of a machine-specific absolute path.
import os

potholes_df = sqlContext.read\
    .format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .option('nullValue', 'NA')\
    .load("file://" + os.path.abspath("potholes.csv"))



In [ ]:

    
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline



In [ ]:

    
# Project the road ratings DataFrame down to six columns.
rr_df = roadratings_df.select(
    'streetID',
    'overall', 'crack', 'patch',
    'length', 'width'
)



In [ ]:

    
# Project the potholes DataFrame down to the street id and the coordinates.
ph_df = potholes_df.select(
    'STREET_ID',
    'Latitude',
    'Longitude'
)



In [ ]:

    
from pyspark.sql import functions



In [ ]:

    
# Count the pothole rows for each STREET_ID and show the streets with the
# highest counts first.
(
    ph_df
    .groupBy('STREET_ID')
    .agg(functions.count('*').alias('n_potholes'))
    .orderBy(functions.col('n_potholes').desc())
    .show()
)

Activity: Compute the average number of potholes per street in the city



In [ ]:

    
## Your code here



In [ ]:

    
# Attach a location string to each street's pothole count.
# The lookup side is de-duplicated so identical (STREET_ID, strLocation) pairs
# coming from individual pothole rows are not repeated in the result, and the
# sort is applied after the join because a join does not guarantee that the
# ordering of its inputs is preserved.
ph_df.groupBy('STREET_ID').agg(functions.count('*').alias('n_potholes')).\
    join(potholes_df.select(['STREET_ID', 'strLocation']).distinct(), 'STREET_ID').\
    orderBy(functions.desc('n_potholes')).show()



In [ ]:

    
# Display the list of column names in rr_df.
rr_df.columns



In [ ]:

    
# Inner-join road ratings with potholes where the street ids match.
# The result has one row per matching (rating row, pothole row) pair.
dataset_df = rr_df.join(
    ph_df,
    on=(rr_df.streetID == ph_df.STREET_ID),
    how='inner'
)



In [ ]:

    
# Print the joined DataFrame's schema (column names and types) as a tree.
dataset_df.printSchema()



In [ ]:

    
# Linear regression estimator: the label is the 'overall' column and the
# predictors are read from the 'features' vector column.
lr = LinearRegression(
    labelCol='overall',
    featuresCol='features'
)



In [ ]:

    
# Pack the single predictor column 'length' into the 'features' vector column.
va = VectorAssembler(
    outputCol='features',
    inputCols=['length']
)



In [ ]:

    
# Chain the two stages so they are fitted and applied together, in this order.
pl = Pipeline(
    stages=[
        va,  # assemble the feature vector
        lr,  # regress the label on that vector
    ]
)



In [ ]:

    
# Build the modelling table: cast the predictor ('length') and the label
# ('overall') to double, then discard every row where either value is null.
input_dataset_df = (
    dataset_df
    .select(
        dataset_df['length'].cast('double').alias('length'),
        dataset_df['overall'].cast('double').alias('overall')
    )
    .na.drop()
)



In [ ]:

    
# Randomly split the rows into roughly 80% training and 20% testing.
# The seed is fixed so the same split (and therefore the same fitted model and
# error figures) is obtained on every Restart & Run All.
training_df, testing_df = input_dataset_df.randomSplit([0.8, 0.2], seed=42)



In [ ]:

    
# Number of rows in the training split.
training_df.count()



In [ ]:

    
# Number of rows in the testing split.
testing_df.count()



In [ ]:

    
# Fit the pipeline (feature assembly followed by linear regression) on the
# training split; the result is the fitted pipeline model.
pl_fit = pl.fit(training_df)



In [ ]:

    
# Apply the fitted pipeline to the training rows and display the first rows
# of the result.
pl_fit.transform(training_df).show()



In [ ]:

    
# Re-check the number of rows in the training split.
training_df.count()



In [ ]:

    
# Re-check the number of rows in the testing split.
testing_df.count()



In [ ]:

    
# Apply the fitted pipeline to the held-out testing rows and display the first
# rows of the result.
pl_fit.transform(testing_df).show()

Activity: Compute MSE



In [ ]:

Activity: Compare models with different feature sets (length + width + Latitude + Longitude)



In [ ]: